#install.packages(c("ggplot2", "dplyr", "ggalluvial", "ggcorrplot", "ggridges", "reshape2", "plotly"))
# Load necessary libraries
library(ggplot2)
library(dplyr)
library(tidyr)
# Summarize data by biome1
hmmer_jr <- read.csv("hmmer_jre.csv")
viral_counts <- hmmer_jr %>%
group_by(biome1) %>%
summarise(
Viral = sum(genes_viral, na.rm = TRUE),
Plasmid = sum(genes_plasmid, na.rm = TRUE),
Microbial = sum(genes_microbial, na.rm = TRUE),
Unknown = sum(genes_unknown, na.rm = TRUE)
) %>% pivot_longer(cols = -biome1, names_to = "Category", values_to = "Count")
# Stacked bar chart
# Remove 'Unknown' category
viral_counts_filtered <- viral_counts %>%
filter(Category != "Unknown")
# Stacked bar chart without 'Unknown'
ggplot(viral_counts_filtered, aes(x = biome1, y = Count, fill = Category)) +
geom_bar(stat = "identity", position = "stack") +
theme_minimal() +
labs(
x = "Biome",
y = "Contig Count",
fill = "Category",
title = "Viral vs. Non-Viral Capsids Across Biomes (Filtered)"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_y_continuous(expand = expansion(mult = c(0, 0.1))) # Adjusts y-axis for better visibility

# Load necessary library
library(dplyr)
# Define T-number categories based on contig length
hmmer_jr <- hmmer_jr %>%
mutate(
T_number = case_when(
contig_length <= 5000 ~ "Mini",
contig_length > 5000 & contig_length <= 10000 ~ "T=1",
contig_length > 10000 & contig_length <= 12500 ~ "T≈2",
contig_length > 12500 & contig_length <= 15000 ~ "T=3",
contig_length > 15000 & contig_length <= 20000 ~ "T=4",
TRUE ~ "Unknown" # For any unexpected values
)
)
# Check if T_number is assigned properly
table(hmmer_jr$T_number)
##
## Mini T=1 T=3 T=4 T≈2
## 9 105 35 450 113
ggplot(hmmer_jr, aes(x = contig_length, y = genes_viral, color = T_number)) +
geom_point(alpha = 0.7, size = 3) +
geom_smooth(method = "lm", se = FALSE, color = "black") + # Trendline
scale_x_log10() +
theme_minimal() +
labs(
x = "Capsid Size (bp, Log Scale)",
y = "Number of Viral Genes",
color = "T-number",
title = "Capsid Size vs. Viral Gene Content"
)
## `geom_smooth()` using formula = 'y ~ x'

library(ggplot2)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
# Count occurrences of each T-number in each biome
t_number_counts <- hmmer_jr %>%
count(biome1, T_number) %>%
pivot_wider(names_from = T_number, values_from = n, values_fill = 0)
# Convert to long format for heatmap
t_number_melt <- melt(t_number_counts, id.vars = "biome1")
# Heatmap
ggplot(t_number_melt, aes(x = biome1, y = variable, fill = value)) +
geom_tile() +
scale_fill_gradient(low = "gray", high = "orange") +
theme_minimal() +
labs(
x = "Biome",
y = "T-number",
fill = "Frequency",
title = "T-number Distribution Across Biomes"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot(hmmer_jr, aes(x = biome1, y = contig_length, fill = biome1)) +
geom_violin(alpha = 0.6) +
geom_boxplot(width = 0.2, color = "black") + # Add boxplot inside violin
scale_y_log10() +
theme_minimal() +
labs(
x = "Biome",
y = "Capsid Length (bp, Log Scale)",
fill = "Biome",
title = "Capsid Length Distributions by Biome"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Select relevant columns
capsid_data <- hmmer_jr %>%
select(T_number, genes_viral, genes_plasmid, genes_microbial, genes_unknown)
# Parallel coordinates plot
ggparcoord(capsid_data, columns = 2:5, groupColumn = 1, scale = "globalminmax") +
theme_minimal() +
labs(
x = "Genomic Features",
y = "Scaled Values",
title = "T-number vs. Genome Composition"
)

library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Aggregate counts
sunburst_data <- hmmer_jr %>%
count(biome1, biome2, T_number)
# Create sunburst plot
plot_ly(sunburst_data, labels = ~biome2, parents = ~biome1, values = ~n, type = 'sunburst')
print(sunburst_data)
## biome1 biome2 T_number n
## 1 Aquatic Freshwater Mini 8
## 2 Aquatic Freshwater T=1 4
## 3 Aquatic Freshwater T=3 6
## 4 Aquatic Freshwater T=4 66
## 5 Aquatic Freshwater T≈2 43
## 6 Aquatic Marine T=1 11
## 7 Aquatic Marine T=3 10
## 8 Aquatic Marine T=4 6
## 9 Aquatic Marine T≈2 25
## 10 Aquatic Non_marine saline T=1 15
## 11 Aquatic Non_marine saline T=3 3
## 12 Aquatic Non_marine saline T≈2 3
## 13 Aquatic Sediment T≈2 2
## 14 Engineered Bioreactor T=4 2
## 15 Engineered Solid waste T≈2 3
## 16 Engineered Wastewater T=1 2
## 17 Engineered Wastewater T=3 2
## 18 Engineered Wastewater T=4 5
## 19 Engineered Wastewater T≈2 4
## 20 Host_Associated Human Mini 1
## 21 Host_Associated Human T=1 71
## 22 Host_Associated Human T=3 4
## 23 Host_Associated Human T=4 356
## 24 Host_Associated Human T≈2 30
## 25 Host_Associated Mammals T=3 4
## 26 Host_Associated Mammals T=4 2
## 27 Host_Associated Mammals T≈2 1
## 28 Host_Associated Plants T=3 4
## 29 Host_Associated Plants T=4 1
## 30 Terrestrial Deep subsurface T=4 1
## 31 Terrestrial Deep subsurface T≈2 1
## 32 Terrestrial Soil T=1 2
## 33 Terrestrial Soil T=3 2
## 34 Terrestrial Soil T=4 6
## 35 Terrestrial Soil T≈2 1
## 36 <NA> <NA> T=4 5
ggplot(hmmer_jr, aes(x = contig_length, y = dtr_length, color = T_number)) +
geom_point(alpha = 0.7, size = 3) +
geom_smooth(method = "lm", se = FALSE, color = "black") +
theme_minimal() +
labs(
x = "Capsid Length (bp)",
y = "DTR Length (bp)",
color = "T-number",
title = "DTR Length vs. Capsid Type"
)
## `geom_smooth()` using formula = 'y ~ x'
